library(Microbiome)
library(phyloseq)

1 Introduction

This package is based on the code of Dr. Villette and Dr. Larsen. The package is written and maintained by Dr. Villette. This package is meant to facilitate microbiome exploration and ensuring nice plotting.
This package covers :

  • The dada2 pipeline with wrapper functions that ease the processing of multiple projects

  • Some plotting functions for beta diversity, heatmap and differential abundance analysis using directly a phyloseq object

  • A pipeline for IgASeq analysis

2 Trim, denoise and align your sequences using dada2 pipeline

For convenience this will not be a reproducible example, dada2 takes too long to compute and knit. This part of the tutorial will present a run that we performed in house. The rest of the tutorial will be based on reproducible data.

2.1 Get the files

We use here a wrapper function that will create a list of three for pair end :

  • forward files

  • reverse files

  • names of the files

And for single end : -

  • a list of files

  • names of the files


f_list = list_fastq("/home/bigbeast/Documents/tmp/2022-12 CIMMAP run ELISE", 
                    pattern = c("R1", "R2"), separator = "_", level = 1)

# check that all files are distinct

lapply(f_list, duplicated)

# check that all files exist5
lapply(f_list, file.exists)
random=  lapply(f_list, "[",sample(1:255,30))
# tmp= summarise_fastq(random, cores =30, plot = F )

2.2 Check the quality profile

We will use the function qc_check. This will take time as the plotQualityProfile isn’t parallelized in dada2. This function will create two plots (for pair end) of n aggregated samples and only one plot if you are using single end.

qc_check(flist, n=30)

2.3 Check the best trimming parameters for your set of fastq

2.3.1 Test optimal cutting param

set.seed(1)
tmp = lapply(f_list, "[",sample(1:255,30))
tmp_list= filt_list(tmp)
filtered= list()

cb= combn(x= (300-seq(0, 50, by=10)), m=2)
# Very long be careful
#try to test combination of trimmings
for(i in 1:dim(cb)[2]){
  filtered[[i]]= filter_fastq(tmp, tmp_list, cutting_param = cb[ ,i], cores = 35, trimleft = 35, maxEE = c(3,4))
}
#extract the data to a df 
per= NULL
for(i in 1:15) {
t =  filtered[[i]] %>% 
  as.data.frame() %>%
  mutate(per=reads.out/reads.in)
per= cbind(per, t$per) %>% 
  as.data.frame()
# per$names=rownames(tmp[1])
print(per)
}

colnames(per)= paste(cb[1,], cb[2,]) 
per$names=rownames(t[1])

# plot 

png("transmic and IgA rescue mice percent reads passed depending on cutting parameters.png", width = 600, height=400)
per %>% 
  as.data.frame() %>%
  pivot_longer(names_to = "cut", values_to = "per", cols = 1:15) %>% 
  ggplot(aes(y=per, x= cut))+
  geom_boxplot()+ geom_line(aes(group=names, col=names))+
  labs(y= "percentage passed", x= "Trimming parameters")+
  theme(axis.text.x = element_text(angle=90), legend.position = "none")
 

dev.off()

2.3.2 Test optimal maxEE

cb= combn(x= (7-seq(0,5, by=1)), m=2)
cb =  cb[nrow(cb):1,]
# Very long be careful
#try to test combination of trimmings
# registerDoParallel(cl = makeCluster(5))
for(i in 1:dim(cb)[2]){
  filtered[[i]]= filter_fastq(tmp, tmp_list, cutting_param = c(260,250),
                              cores = 35, trimleft = 35, maxEE = cb[ ,i])
}
#extract the data to a df 
per= NULL
for(i in 1:15) {
t =  filtered[[i]] %>% 
  as.data.frame() %>%
  mutate(per=reads.out/reads.in)
per= cbind(per, t$per) %>% 
  as.data.frame()
}

colnames(per)= paste(cb[1,], cb[2,]) 
per$names=rownames(t[1])

# plot 

png("transmic and IgA rescue mice percent reads passed depending on errors parameters.png", width = 600, height=400)
per %>% as.data.frame() %>% pivot_longer(names_to = "cut", values_to = "per", cols = 1:15) %>% 
  ggplot(aes(y=per, x= cut))+
  geom_boxplot()+ geom_line(aes(group=names, col=names))+
  labs(y= "percentage passed", x= "Trimming parameters")+
  theme(axis.text.x = element_text(angle=90), legend.position = "none")
dev.off()

2.4 Filter and trim the fastq

We now have to remove the bad quality reads and trim the length. You will find a function to create the list of filtered files and one to make the filtered files.

filt= filt_list(f_list) # create the list of filtered files

filtered = filterAndTrim(fwd= fwd, filt = filtFs, rev=rv, filt.rev = filtRs, truncLen = c(260,240), trimLeft = 25, maxEE = c(3,5), multithread = 45)

3 Analyse your 16S data now

We will use the enterotype data to explore some of the plotting functions. Let’s start with the beta diversity functions beta_diversity and beta_dispersion.

data(enterotype)

3.1 Alpha diversity plots

TBD

3.2 Beta diversity plots

You will have the choice between beta_diversity and beta_dispersion for your beta diversity plotting.

will plot any of the following methods : PCoA, NMDS, PCA, BCA. More methods will be implemented later on. You will end up with a two dimensions projected using dots and ellipses. The ellipses here are not confidence ellipses but a graphical summary.

will plot only PCoA for the moment, I need to work around some errors to implement more methods. This function will plot the two components of your choosing, confidence ellipses and boxplot for each axis and for each group. Each function will return a plot and a percentage of contribution for each component.

beta_diversity(enterotype, dist="bray", method="NMDS", group="SeqTech", permanova = F)
#> Run 0 stress 0.1380077 
#> Run 1 stress 0.1559771 
#> Run 2 stress 0.1457331 
#> Run 3 stress 0.1549686 
#> Run 4 stress 0.152527 
#> Run 5 stress 0.1528008 
#> Run 6 stress 0.1466665 
#> Run 7 stress 0.1509119 
#> Run 8 stress 0.1532556 
#> Run 9 stress 0.1518128 
#> Run 10 stress 0.1470609 
#> Run 11 stress 0.1578287 
#> Run 12 stress 0.1586562 
#> Run 13 stress 0.1544834 
#> Run 14 stress 0.1536689 
#> Run 15 stress 0.1531361 
#> Run 16 stress 0.1466797 
#> Run 17 stress 0.1485687 
#> Run 18 stress 0.15547 
#> Run 19 stress 0.1526176 
#> Run 20 stress 0.1490201 
#> *** Best solution was not repeated -- monoMDS stopping criteria:
#>     16: stress ratio > sratmax
#>      4: scale factor of the gradient < sfgrmin


beta_dispersion(enterotype, dist = "bray", method = "PCoA", group = "SeqTech")

#> [1] 45.416387 11.676605  7.782120  3.613679  3.195623

You can play with the parameters of each function like the following plots :

beta_diversity(enterotype, dist="bray", method="PCoA", group="SeqTech",
               color_vector =c("#777711", "#117777", "#DD7788"),
               factor_to_plot = "Sequencing technologies \n impact 16S results", 
               lwd = 2, cpoint = 2, permanova = T)


beta_dispersion(enterotype, dist = "bray", method = "PCoA", group = "SeqTech",
                color_vector = c("#777711", "#117777", "#DD7788"),
                legend_title = "Sequencing tech", lwd = 2, 
                font = 2, draw = "polygon", text = T, permanova = T, 
                y.intersp = 0.7)

#> [1] 45.416387 11.676605  7.782120  3.613679  3.195623

3.3 Heatmap based on the phyloseq object

This function will perform a top taxa at the rank you choose and create a heatmap with annotations. For now only one annotation is supported.
The clusterisation is made using the hclustfunction and a Ward.D2 method. You can define the distance matrix that you want to use. It is important to note that the distance is made before any trimming of the data. This means that the distance matrix is made at the ASV/OTU level before doing the rank merging and topping, so the clusterization will represent your “true” data instead of a modified dataset.

phylo_heatmap(enterotype, top = 30, labels = "SeqTech", taxa_rank = "Genus", factor_to_plot ="Enterotype", split = 3, distance = "bray" )
#> [1] "No phylogenetic tree in this phyloseq object, bray-curtis distance selected."
#> [1] " not reversed"

The idea behind these functions is : creating a more automatic pipeline enabling filtering and subseting of the phyloseq object without having to perform a temporary phyloseq object and a temporary distance object. With these function you can directly use the “pipe” introduce by magrittr and dplyr.
So you can use a subset_samples like the following and automatically plot your beta diversity without adding to much code.

enterotype %>%
  subset_samples(SeqTech=="Illumina") %>% 
  beta_dispersion(group="Enterotype", color_vector = c("#777711", "#117777", "#DD7788"), 
                  legend_title = "Enterotypes \n with bray" ,
                  lwd = 2, font = 2, draw = "lines", text = T, permanova = T, y.intersp = 0.7, 
                  where="bottomleft", cex = 3)

#> [1] 31.826209 27.601606  6.621976  4.908661  4.022113

Additionnaly, if you want to go further you can also serialized the code with a for loop or a lapply or even a parallel approach using mclapply or doParallel.

layout(matrix(c(1,2,3),
                nrow = 1,
                ncol = 3,
                byrow = TRUE))
for(i in c("Illumina", "Sanger", "Pyro454")){
  enterotype %>%
    subset_samples(SeqTech==i & !is.na(Enterotype))%>%
    beta_diversity(dist="bray", method="PCoA", group="Enterotype", 
                   color_vector =c("#777711", "#117777", "#DD7788"), 
                   factor_to_plot = paste("Enterotypes using \n",i, "sequencing"), lwd = 2, cpoint = 2)
}

3.4 Differential abundance testing

data("GlobalPatterns")
tmp= GlobalPatterns %>%
  subset_samples(SampleType=="Feces" | SampleType=="Soil")%>%
  tax_glom("Genus")
res= tmp %>% 
  differential_abundance( group="SampleType", col1 = "brown", col2="darkgreen")

3.4.1 Results

head(res$all_features)
#>                                  we.ep     we.eBH      wi.ep    wi.eBH
#> Nitrosopumilus 3           0.031241004 0.11488351 0.06651786 0.1665080
#> CandidatusNitrososphaera 4 0.081914361 0.21867567 0.05714286 0.1529735
#> Methanocorpusculum 15      0.383904745 0.51762444 0.52120536 0.6272339
#> Methanobacterium 17        0.214928167 0.35040933 0.24575893 0.3708109
#> Methanobrevibacter 19      0.005537288 0.04470355 0.05714286 0.1529735
#> Propionibacterium 20       0.584185912 0.69804707 0.69017857 0.7791258
#>                               rab.all rab.win.Feces rab.win.Soil   diff.btw
#> Nitrosopumilus 3            0.1749800      1.962080   -3.8218057  -6.312766
#> CandidatusNitrososphaera 4  3.0514559      1.871025    6.8986851   5.456450
#> Methanocorpusculum 15      -0.8226364     -1.387899   -0.1459434   1.486529
#> Methanobacterium 17        -1.7929303      0.465147   -3.6692233  -3.733588
#> Methanobrevibacter 19       5.4898365      9.181054   -3.7474231 -13.175678
#> Propionibacterium 20       -1.8642525     -1.269702   -3.1182893  -0.852597
#>                            diff.win     effect      overlap
#> Nitrosopumilus 3           3.341048 -1.8061270 0.0156427220
#> CandidatusNitrososphaera 4 3.157491  1.8713503 0.0003653998
#> Methanocorpusculum 15      3.062627  0.3491589 0.3056999939
#> Methanobacterium 17        3.926509 -0.8660320 0.1450793206
#> Methanobrevibacter 19      4.816949 -2.8699609 0.0003653998
#> Propionibacterium 20       5.150261 -0.1925011 0.4145079690

3.4.2 Barplot results

res$barplot

3.4.3 Volcano plot results

res$volcano

TBD

3.5 Model testing

Last but not least : machine learning. Machine learning is used in general to predict outcome or predict class assignation between health and disease status. For this purpose we developed wrapper functions to screen models more easily. Most of the code is actually generated using caret [https://topepo.github.io/caret/] package.

For now the functions implement randomForest, glmnet and plsda models, other model might work but I didn’t test them yet.

3.5.1 Screen models on your raw data

This function accepts either objects or a list containing a matrix of taxa for example and a dataframe with the variables to test.

res = screen_models(enterotype, model="glmnet", cores = 1, number = 3, repeats = 3)
#> [1] "Sample_ID : is constituted of unique observations"
#> Warning in caret::createDataPartition(data$y[[i]], list = F, p = 0.5): Some
#> classes have a single record ( AM.AD.1, AM.AD.2, AM.F10.T1, AM.F10.T2, DA.AD.1,
#> DA.AD.2, DA.AD.3, DA.AD.4, ES.AD.1, ES.AD.2, ES.AD.3, ES.AD.4, FR.AD.1,
#> FR.AD.2, FR.AD.3, FR.AD.4, FR.AD.5, FR.AD.6, FR.AD.7, FR.AD.8, IT.AD.1,
#> IT.AD.2, IT.AD.3, IT.AD.4, IT.AD.5, IT.AD.6, JP.AD.1, JP.AD.2, JP.AD.3,
#> JP.AD.4, JP.AD.5, JP.AD.6, JP.AD.7, JP.AD.8, JP.AD.9, JP.IN.1, JP.IN.2,
#> JP.IN.3, JP.IN.4 ) and these will be selected for the sample
#> Warning in caret::createDataPartition(data$y[[i]], list = F, p = 0.5): Some
#> classes have a single record ( CD, UC ) and these will be selected for the
#> sample

4 IgA seq analysis pipeline

You’ve performed all your sortings and sequencing, you now have three samples coming from a single individual.
We created a pipeline analysis where you will use the neg1 (9/10 of the neg fraction) and neg2 (1/10 of the neg fraction) dispersion (centered and reduced) to create a normal dispersion, using a Z approach we will have a Z score for the pos/neg1 based on the standard deviation of neg1/neg2.

Here we can see what the analysis will look like:

  • The technical dispersion is assessed using neg1/neg2for the log2 ratio and the neg1*neg2abundance for log10 (black dot)

  • The biological dispersion is assessed pos/neg1 for the log2 ratio and the pos*neg1 abundance for log10 (orange dot)

We will then take windows of X ASV (for example 20) to create n Gaussian curve and n …
The pipeline is based on three main functions that will call for other functions : seq_table, slide_z and collapse_IgAseq.

4.1 Make the seq_table list

First the seq_table function will take your phyloseq object and transform it to a list of data frames, one data frame for each samples coming from a single individual. For this function you will need to give physeq, sample_name corresponding to the name identifying the individual from which the sorted samples came, sorting_names the column where we find the samples such as : “sample1_pos”, “sample1_neg1”, “sample1_neg2”. Then the cols_to_keep that need to stay for now on “all”, it will collapse your sample_data in one column to allow you get it back later on.

The function will tell you if there is some samples are alone, if you have duplicated samples you need to sort them out or the rest of the pipeline will block. You can take a look at the architecture of the new object, you will find the ASV sequence as rownames, the taxonomy collapsed with “#” separator, the three samples having their own column and the sample_data collapse using also “#” separator. The function will only take the ASV that are present in the samples.

 data("igaseq")
sample_names(igaseq)= sample_data(igaseq)$sample_sort

seq.tab= seq_table(igaseq, sample_name = "sample_origin", sorting_names = "sample_sort", cols_to_keep = "all" )
#> otu_table has been transposed
#> sample MO308 is alone 
#> sample C1232 is alone 
#> sample C1283 is alone 
#> sample T2648 is alone

DT::datatable(seq$MO101, rownames = F)

The colnames will have the sample_names as given in the otu_table(), check that they correspond to your pos, neg1 and neg2.

Run the main function Now we will run the main function : slide_z. This function will take you seq_table object and run the Z function for each samples. If you are running this function for the first time use plot=T, if you already made the plots let it as FALSE it will make the loop a lot faster.

4.2 Deal with the zero values

In this approach we will use log2 ratio and log10 of abundances. As you know log2(0/x) or log2(x/0) can’t be performed, so we need a way to deal with the zeros. We came up with two approach :

  • remove all ASV with a zero value

  • replace 0 by a random number between 0 and the min value found in one of the three samples

  • replace 0 by the minimum count in each samples, this probably the worst thing to do

In sample with few ASV, i.e meconiums, I strongly recommend using the random_generation while in adults samples the no_zero approach is performing better. For more complex samples the decision is up to you, you can use the function neg_dipsersion to visualize the two outcomes.

neg_dispersion(seq.tab[[6]], positive_sorted_sample = "pos", negative_sorted_sample = "neg1", second_negative_sample = "neg2", type = "superposed")
#> Warning: Removed 2 rows containing missing values (`geom_point()`).
#> Warning: Removed 8 rows containing missing values (`geom_point()`).
#> Warning: Removed 1 rows containing missing values (`geom_point()`).


neg_dispersion(seq.tab[[6]], positive_sorted_sample = "pos", negative_sorted_sample = "neg1", second_negative_sample = "neg2",type = "facet all three")
#> Warning: Removed 2 rows containing missing values (`geom_point()`).
#> Warning: Removed 1 rows containing non-finite values (`stat_ellipse()`).
#> Warning: Removed 1 rows containing missing values (`geom_point()`).
#> Warning: Removed 8 rows containing missing values (`geom_point()`).

# run the following if you want every samples, make sure to change the number of cores
# pdf("test.pdf", width = 10, height = 7)
# mclapply(seq, neg_dispersion, mc.cores = 6, type="superposed")
# dev.off()

4.3 Run the slide_z function

The first step will be to create log2 ratios and log10 abundance (log10 abundance of pos * neg1 for example) for each ASV between the pos and the neg1 (log_2_ratio - log10_abundance) and between the neg1 and neg2 (log2_neg_ratio - log10_neg_abundance). This will be done by the function log_ratio called by the slide_z function.
The function will also create an ellipse of confidence interval of your choosing, default being confidence_interval=c(0.95,0.99, 0.999), this is done by the function ellipse_me also called by the `slide_z` function.

The output will be a list of S4 objects containing the following slots :

  • ig_seq_all : containing all the samples and all the ASV

  • ig_up : containing all the samples and all the ASV that significantly enriched in the IgA positive fraction

  • ig_down : containing all the samples and all the ASV that are significantly enriched in the IgA negative fraction

In each slot you will find the following columns :

  • taxonomy

  • sample_id

  • new : the sample_data collapsed

  • pos : positive fraction abundance

  • neg1 : negative (9/10) fraction abundance

  • neg2 : negative (1/10) fraction abundance

  • log10_abundance = log10(pos * neg1)

  • log2_ratio = log2(pos / neg1)

  • log10_neg_abundance = log10(neg1 * neg2)

  • log2_neg_ratio = log2(neg1 / neg2)

  • taxa = tax_table collapsed

  • SlideNorm = the normalized dispersion for each ASV

  • score = IgAseq score

  • ellipse_level = the level of confidence

IgA_seq=list()

system.time(for(i in names(seq.tab)){
  print(i)
  IgA_seq[[i]]= slide_z(seq.tab[[i]], positive_sorted_sample = "pos", negative_sorted_sample = "neg1", second_negative_sample = "neg2", deltaX = 30, slide_version = "slide_z_modern", alpha = 0.05, plot = F, zero_treatment = "random generation") 
})
#> [1] "MO101"
#> Le chargement a nécessité le package : htmlwidgets
#> Le chargement a nécessité le package : plotly
#> 
#> Attachement du package : 'plotly'
#> L'objet suivant est masqué depuis 'package:MASS':
#> 
#>     select
#> L'objet suivant est masqué depuis 'package:ComplexHeatmap':
#> 
#>     add_heatmap
#> L'objet suivant est masqué depuis 'package:ggplot2':
#> 
#>     last_plot
#> L'objet suivant est masqué depuis 'package:stats':
#> 
#>     filter
#> L'objet suivant est masqué depuis 'package:graphics':
#> 
#>     layout
#> 22 asv equal to 0 were replaced by a random generation with a max of  4.25
#> Le chargement a nécessité le package : car
#> Le chargement a nécessité le package : carData
#> 
#> Attachement du package : 'car'
#> L'objet suivant est masqué depuis 'package:dplyr':
#> 
#>     recode
#> L'objet suivant est masqué depuis 'package:purrr':
#> 
#>     some
#> Le chargement a nécessité le package : sp
#> [1] "C1101"
#> 23 asv equal to 0 were replaced by a random generation with a max of  4.46666666666667
#> [1] "MO537"
#> 28 asv equal to 0 were replaced by a random generation with a max of  2.33333333333333
#> [1] "C1526"
#> 7 asv equal to 0 were replaced by a random generation with a max of  4.93333333333333
#> [1] "C1537"
#> 2 asv equal to 0 were replaced by a random generation with a max of  14.1
#> [1] "MO536"
#> 29 asv equal to 0 were replaced by a random generation with a max of  9.73333333333334
#> [1] "C1040"
#> 12 asv equal to 0 were replaced by a random generation with a max of  3.2
#> [1] "MO040"
#> 21 asv equal to 0 were replaced by a random generation with a max of  6.48333333333333
#> [1] "T2616"
#> 4 asv equal to 0 were replaced by a random generation with a max of  6.76666666666666
#> [1] "MO042"
#> 42 asv equal to 0 were replaced by a random generation with a max of  6.3
#> [1] "MO078"
#> 27 asv equal to 0 were replaced by a random generation with a max of  3.48333333333333
#> [1] "MO137"
#> 28 asv equal to 0 were replaced by a random generation with a max of  3.5
#> [1] "MO147"
#> 62 asv equal to 0 were replaced by a random generation with a max of  5.5
#> [1] "MO154"
#> 40 asv equal to 0 were replaced by a random generation with a max of  5.4
#> [1] "MO177"
#> 47 asv equal to 0 were replaced by a random generation with a max of  3.33333333333333
#> [1] "MO183"
#> 39 asv equal to 0 were replaced by a random generation with a max of  7.2
#> [1] "MO219"
#> 39 asv equal to 0 were replaced by a random generation with a max of  0
#> [1] "MO248"
#> 25 asv equal to 0 were replaced by a random generation with a max of  5.61666666666667
#> [1] "MO249"
#> 43 asv equal to 0 were replaced by a random generation with a max of  3.33333333333333
#> [1] "MO253"
#> 25 asv equal to 0 were replaced by a random generation with a max of  4.61666666666667
#> [1] "MO267"
#> 20 asv equal to 0 were replaced by a random generation with a max of  10.8666666666667
#> [1] "MO294"
#> 38 asv equal to 0 were replaced by a random generation with a max of  6.16666666666667
#> [1] "MO371"
#> 29 asv equal to 0 were replaced by a random generation with a max of  10.4
#> [1] "MO380"
#> 23 asv equal to 0 were replaced by a random generation with a max of  7.2
#> [1] "MO381"
#> 39 asv equal to 0 were replaced by a random generation with a max of  0.4
#> [1] "MO391"
#> 14 asv equal to 0 were replaced by a random generation with a max of  10.7
#> [1] "MO406"
#> 35 asv equal to 0 were replaced by a random generation with a max of  4.53333333333333
#> [1] "MO408"
#> 24 asv equal to 0 were replaced by a random generation with a max of  0
#> [1] "C1499"
#> 3 asv equal to 0 were replaced by a random generation with a max of  11.3666666666667
#> [1] "MO446"
#> 56 asv equal to 0 were replaced by a random generation with a max of  7.2
#> [1] "MO447"
#> 16 asv equal to 0 were replaced by a random generation with a max of  6.51666666666667
#> [1] "C1455"
#> 18 asv equal to 0 were replaced by a random generation with a max of  5.98333333333333
#> [1] "MO495"
#> 32 asv equal to 0 were replaced by a random generation with a max of  0
#> [1] "MO510"
#> 28 asv equal to 0 were replaced by a random generation with a max of  2.66666666666667
#> [1] "C1579"
#> 13 asv equal to 0 were replaced by a random generation with a max of  5.76666666666667
#> [1] "C1078"
#> 12 asv equal to 0 were replaced by a random generation with a max of  2.13333333333334
#> [1] "C1105"
#> 14 asv equal to 0 were replaced by a random generation with a max of  10.8833333333333
#> [1] "C1108"
#> 13 asv equal to 0 were replaced by a random generation with a max of  7.6
#> [1] "C1137"
#> 16 asv equal to 0 were replaced by a random generation with a max of  1.51666666666667
#> [1] "C1154.2"
#> 16 asv equal to 0 were replaced by a random generation with a max of  2.53333333333333
#> [1] "C1170"
#> 16 asv equal to 0 were replaced by a random generation with a max of  6.6
#> [1] "C1177"
#> 24 asv equal to 0 were replaced by a random generation with a max of  4.73333333333333
#> [1] "C1248"
#> 12 asv equal to 0 were replaced by a random generation with a max of  2.01666666666667
#> [1] "C1253"
#> 8 asv equal to 0 were replaced by a random generation with a max of  6.86666666666667
#> [1] "C1294"
#> 17 asv equal to 0 were replaced by a random generation with a max of  3.3
#> [1] "C1325"
#> 14 asv equal to 0 were replaced by a random generation with a max of  4.25
#> [1] "C1338"
#> 9 asv equal to 0 were replaced by a random generation with a max of  4.6
#> [1] "C1371"
#> 7 asv equal to 0 were replaced by a random generation with a max of  8.83333333333333
#> [1] "C1381"
#> 14 asv equal to 0 were replaced by a random generation with a max of  11.4
#> [1] "C1406"
#> 25 asv equal to 0 were replaced by a random generation with a max of  6.33333333333333
#> [1] "C1442"
#> 10 asv equal to 0 were replaced by a random generation with a max of  11.5
#> [1] "C1510"
#> 23 asv equal to 0 were replaced by a random generation with a max of  3.33333333333333
#> [1] "C1605"
#> 13 asv equal to 0 were replaced by a random generation with a max of  1.73333333333333
#> [1] "C1608"
#> 17 asv equal to 0 were replaced by a random generation with a max of  11
#> [1] "C1617"
#> 10 asv equal to 0 were replaced by a random generation with a max of  0.75
#> [1] "C1636"
#> 32 asv equal to 0 were replaced by a random generation with a max of  15.2166666666667
#> [1] "C1645"
#> 5 asv equal to 0 were replaced by a random generation with a max of  4.43333333333333
#> [1] "T1063"
#> 23 asv equal to 0 were replaced by a random generation with a max of  3.46666666666667
#> [1] "T1147"
#> 12 asv equal to 0 were replaced by a random generation with a max of  9.58333333333333
#> [1] "T1183"
#> 21 asv equal to 0 were replaced by a random generation with a max of  4.73333333333333
#> [1] "T1267"
#> 24 asv equal to 0 were replaced by a random generation with a max of  2.66666666666667
#> [1] "T2183"
#> 17 asv equal to 0 were replaced by a random generation with a max of  6.9
#> [1] "T2547"
#> 16 asv equal to 0 were replaced by a random generation with a max of  7.38333333333333
#> [1] "T2554"
#> 14 asv equal to 0 were replaced by a random generation with a max of  6.73333333333333
#> [1] "T2423"
#> 7 asv equal to 0 were replaced by a random generation with a max of  1.1
#> [1] "MO608"
#> 21 asv equal to 0 were replaced by a random generation with a max of  2.66666666666667
#> [1] "MO617"
#> 22 asv equal to 0 were replaced by a random generation with a max of  3.33333333333333
#> [1] "MO635"
#> 37 asv equal to 0 were replaced by a random generation with a max of  3.23333333333333
#> [1] "MO636"
#> 10 asv equal to 0 were replaced by a random generation with a max of  0
#> [1] "MO645"
#> 16 asv equal to 0 were replaced by a random generation with a max of  8.2
#> [1] "MO423"
#> 35 asv equal to 0 were replaced by a random generation with a max of  1.48333333333333
#> [1] "C1213"
#> 0 asv equal to 0 were replaced by a random generation with a max of  12.4666666666667
#> [1] "C1277"
#> 5 asv equal to 0 were replaced by a random generation with a max of  10.0666666666667
#> [1] "C1388"
#> 25 asv equal to 0 were replaced by a random generation with a max of  5.53333333333333
#> [1] "T1441"
#> 3 asv equal to 0 were replaced by a random generation with a max of  15.2
#> [1] "T1547"
#> 14 asv equal to 0 were replaced by a random generation with a max of  3.16666666666667
#> [1] "T1423"
#> 10 asv equal to 0 were replaced by a random generation with a max of  0
#> [1] "T1515"
#> 19 asv equal to 0 were replaced by a random generation with a max of  5.4
#> [1] "T2063"
#> 12 asv equal to 0 were replaced by a random generation with a max of  6.46666666666667
#> [1] "T2267"
#> 11 asv equal to 0 were replaced by a random generation with a max of  3.33333333333333
#> [1] "T2441"
#> 2 asv equal to 0 were replaced by a random generation with a max of  17.7833333333333
#> [1] "T1616"
#> 9 asv equal to 0 were replaced by a random generation with a max of  5.66666666666667
#> utilisateur     système      écoulé 
#>       1.112       0.028       1.149

4.4 Collapse the list of IgAseq into a single S4 object

We now need to collapse the list of IgA_seq objects into a single one, just use collapse_IgAseq and separate the columns that were pasted together.

IgA_seq = collapse_IgAseq(IgA_seq)
# DT:: datatable(IgA_seq@ig_seq_all, rownames = F)
# View(IgA_seq)


IgA_all= IgA_seq@ig_seq_all  %>% 
  separate(taxonomy, into=c("Reign","Phylum", "Order","Class","Family", "Genus","Species","ASV", "rest"), sep="#") %>% 
  separate(col = new, into=colnames(sample_data(igaseq)),sep= "#" )

IgA_all$alpha= ifelse(IgA_all$score>1.96, "positively significative", ifelse(IgA_all$score< -1.96, "negatively significative", "not significative"))

dim(IgA_all)
#> [1] 4254   29

4.5 Plot like Gordon IgA seq

For the example we will plot the IgAseq data like Gordon’s paper. We first make a wilcoxon test to decipher which genera are significantly different from zero. Then we plot as balloon plot using ggplot.

library(rstatix)
#> 
#> Attachement du package : 'rstatix'
#> L'objet suivant est masqué depuis 'package:MASS':
#> 
#>     select
#> L'objet suivant est masqué depuis 'package:stats':
#> 
#>     filter
tmp= IgA_all %>% 
  group_by(Genus, donor)%>% 
   mutate(n=n())%>%
  filter(n>5 , Genus!="NA")%>%
  wilcox_test(score~1, mu=0, alternative ="two.sided", detailed = T)

tmp2= IgA_all %>%
  group_by(Genus, donor)%>%
  mutate(n=n())%>%
  filter(n>5, Genus!="NA")%>%
  dplyr::select(Genus, score, donor)%>%
    summarise(mean_score= mean(score))%>%
  left_join(tmp)
#> `summarise()` has grouped output by 'Genus'. You can override using the
#> `.groups` argument.
#> Joining with `by = join_by(Genus, donor)`
  alpha= ifelse(tmp2$p<0.05, -log10(tmp2$p), 0.5)
  
  tmp2$donor= factor(tmp2$donor, levels = c("child_delivery","child_2_months","child_24_months", "mother_24_months"))
  tmp2%>%
  ggplot(aes(donor, Genus))+
  geom_point(aes(size=alpha, fill=mean_score), alpha=alpha , shape=21)+
    scale_size_binned(range=c(0,10), trans = "log10")+
    scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0, breaks=c(-15,-10,-5,-2,1,0,1,2,5,10))+
    # scale_x_discrete(labels=c("Delivery", "2 months", "24 months", "Mothers"))+
    labs(fill="IgAseq mean score", x="Age", size="Level of significance")+
       theme(axis.text.y = element_text(size=15, face="bold"),
         axis.text.x=element_text(size=25, face="bold"),
         strip.text.y = element_text(angle = 0, size=15, face="bold"),
         axis.title.x = element_blank(), 
         axis.title.y = element_blank(),
         legend.position = 'bottom',
         legend.title = element_text(size=20, face="bold"), 
         legend.text = element_text(size=20, face="bold"), 
         legend.box = "vertical",
         legend.key.size = unit(2,"cm")
        )

4.6 Test models using the IgASeq data

5 Use multi-omics models based on mixOmics package